Assignment 04

Author: Makenzie Howard
Affiliation: Boston University
Published: October 6, 2025
Modified: October 6, 2025

1 Load the dataset

#| echo: true
#| eval: true
from pyspark.sql import SparkSession
import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np

np.random.seed(42)

pio.renderers.default = "notebook+notebook_connected+vscode"

# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Load Data
df = (
    spark.read
         .option("header", "true")
         .option("inferSchema", "true")
         .option("multiLine", "true")
         .option("escape", "\"")
         .csv("./data/lightcast_job_postings.csv")
)

# Show Schema and Sample Data
print("---This is Diagnostic check, No need to print it in the final doc---")

# df.printSchema()  # uncomment for debugging; keep commented in the rendered submission
df.show(5)
--- Diagnostic check; no need to print this in the final doc ---
[df.show(5) output elided for readability: 5 sample rows across the dataset's 130+ columns (IDs, posting/expiry dates, titles, company fields, salary fields, location fields, NAICS/SOC/LOT taxonomy codes and names, skills arrays), each cell truncated by Spark to 20 characters]
only showing top 5 rows
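As a quick sanity check on the load (a minimal sketch; the exact totals depend on the CSV extract), the row and column counts can be printed before any filtering:

# Sanity check (sketch): confirm the extract loaded with the expected shape
print(f"Rows: {df.count():,}")        # triggers a full scan; acceptable for one-off EDA
print(f"Columns: {len(df.columns)}")  # Lightcast postings exports carry 130+ columns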

2 Feature Engineering

from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler  # reserved for the pipeline sketch below
from pyspark.ml import Pipeline

# Columns to preview for quick EDA
eda_cols = [
    "SALARY",
    "MIN_YEARS_EXPERIENCE", "DURATION",
    "COMPANY_IS_STAFFING", "IS_INTERNSHIP",
    "STATE_NAME", "REMOTE_TYPE_NAME", "EMPLOYMENT_TYPE_NAME",
    "MIN_EDULEVELS_NAME"
]

# Keep only the columns that actually exist; warn about any missing
existing_cols = [c for c in eda_cols if c in df.columns]
missing_cols  = [c for c in eda_cols if c not in df.columns]
if missing_cols:
    print("⚠️ Missing columns (skipped):", missing_cols)

# Select and show a few rows
df_eda = df.select(*existing_cols)
print("Showing columns:", existing_cols)
df_eda.show(5, truncate=False)
Showing columns: ['SALARY', 'MIN_YEARS_EXPERIENCE', 'DURATION', 'COMPANY_IS_STAFFING', 'IS_INTERNSHIP', 'STATE_NAME', 'REMOTE_TYPE_NAME', 'EMPLOYMENT_TYPE_NAME', 'MIN_EDULEVELS_NAME']
+------+--------------------+--------+-------------------+-------------+----------+----------------+----------------------+-------------------+
|SALARY|MIN_YEARS_EXPERIENCE|DURATION|COMPANY_IS_STAFFING|IS_INTERNSHIP|STATE_NAME|REMOTE_TYPE_NAME|EMPLOYMENT_TYPE_NAME  |MIN_EDULEVELS_NAME |
+------+--------------------+--------+-------------------+-------------+----------+----------------+----------------------+-------------------+
|NULL  |2                   |6       |false              |false        |Arkansas  |[None]          |Full-time (> 32 hours)|Bachelor's degree  |
|NULL  |3                   |NULL    |true               |false        |Maine     |Remote          |Full-time (> 32 hours)|No Education Listed|
|NULL  |5                   |35      |false              |false        |Texas     |[None]          |Full-time (> 32 hours)|Bachelor's degree  |
|NULL  |3                   |48      |false              |false        |Arizona   |[None]          |Full-time (> 32 hours)|No Education Listed|
|92500 |NULL                |15      |false              |false        |California|[None]          |Part-time / full-time |No Education Listed|
+------+--------------------+--------+-------------------+-------------+----------+----------------+----------------------+-------------------+
only showing top 5 rows
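The StringIndexer, OneHotEncoder, VectorAssembler, and Pipeline imports above are not used in this cell; a minimal sketch of how they would fit together on these columns (the column choices and handleInvalid settings are assumptions, not part of the assignment):

# Sketch: encode categoricals and assemble a feature vector (assumed column choices)
cat_cols = ["STATE_NAME", "REMOTE_TYPE_NAME", "EMPLOYMENT_TYPE_NAME", "MIN_EDULEVELS_NAME"]
indexers = [StringIndexer(inputCol=c, outputCol=f"{c}_idx", handleInvalid="keep") for c in cat_cols]
encoders = [OneHotEncoder(inputCol=f"{c}_idx", outputCol=f"{c}_vec") for c in cat_cols]
assembler = VectorAssembler(
    inputCols=[f"{c}_vec" for c in cat_cols] + ["MIN_YEARS_EXPERIENCE"],
    outputCol="features",
    handleInvalid="skip",  # drops rows with nulls in the numeric input
)
pipeline = Pipeline(stages=indexers + encoders + [assembler])
# model_input = pipeline.fit(df_eda).transform(df_eda)  # run once the data is cleaned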

3 Missing-values plot

# Missingness by column (nulls + blank strings) + Plotly bar chart

from pyspark.sql.functions import col, sum as spark_sum, when, trim, length
import pandas as pd
import plotly.express as px

# Use your EDA subset if it exists; otherwise use the full df
src = df_eda if 'df_eda' in globals() else df

# 1) Spark: count null OR blank-string cells per column
null_or_blank_sums = [
    spark_sum(
        when(col(c).isNull() | (length(trim(col(c).cast("string"))) == 0), 1).otherwise(0)
    ).alias(c)
    for c in src.columns
]
missing_df = src.select(null_or_blank_sums)

# 2) To pandas + percentages
missing_pd = missing_df.toPandas().T.reset_index()
missing_pd.columns = ["column", "missing_count"]
total_rows = src.count()
missing_pd["missing_pct"] = 100.0 * missing_pd["missing_count"] / max(total_rows, 1)
missing_pd = missing_pd.sort_values("missing_pct", ascending=False)

# 3) Plot
fig = px.bar(
    missing_pd,
    x="column",
    y="missing_pct",
    title="Percentage of Missing Values by Column",
    labels={"column": "Features", "missing_pct": "Missing (%)"}
)
fig.update_layout(xaxis_tickangle=-45, height=500, width=900)
fig
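One possible follow-up, sketched below (the 90% cutoff is an arbitrary assumption): flag columns that are almost entirely empty as drop candidates.

# Sketch: flag mostly-empty columns as drop candidates (threshold is an assumption)
DROP_THRESHOLD = 90.0
drop_candidates = missing_pd.loc[missing_pd["missing_pct"] > DROP_THRESHOLD, "column"].tolist()
print("Drop candidates:", drop_candidates or "none above threshold")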

4 Unique values

from pyspark.sql.functions import countDistinct

# use df_eda if it exists; else fall back to df
src = df_eda if 'df_eda' in globals() else df

src.select([countDistinct(c).alias(f"{c}_nunique") for c in src.columns]).show(truncate=False)
+--------------+----------------------------+----------------+---------------------------+---------------------+------------------+------------------------+----------------------------+--------------------------+
|SALARY_nunique|MIN_YEARS_EXPERIENCE_nunique|DURATION_nunique|COMPANY_IS_STAFFING_nunique|IS_INTERNSHIP_nunique|STATE_NAME_nunique|REMOTE_TYPE_NAME_nunique|EMPLOYMENT_TYPE_NAME_nunique|MIN_EDULEVELS_NAME_nunique|
+--------------+----------------------------+----------------+---------------------------+---------------------+------------------+------------------------+----------------------------+--------------------------+
|6052          |16                          |60              |2                          |2                    |51                |4                       |3                           |6                         |
+--------------+----------------------------+----------------+---------------------------+---------------------+------------------+------------------------+----------------------------+--------------------------+
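Exact distinct counts require a full shuffle; on larger extracts, approx_count_distinct trades a small relative error for speed (a sketch; the 5% rsd is an assumption):

# Sketch: approximate distinct counts (HyperLogLog++) for large data
from pyspark.sql.functions import approx_count_distinct
src.select(
    [approx_count_distinct(c, rsd=0.05).alias(f"{c}_approx_nunique") for c in src.columns]
).show(truncate=False)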
                                                                                

5 Category counts

from pyspark.sql.functions import count, desc

# Use your EDA subset if available
src = df_eda if 'df_eda' in globals() else df

categorical_cols = [
    "STATE_NAME",
    "REMOTE_TYPE_NAME",
    "EMPLOYMENT_TYPE_NAME",
    "MIN_EDULEVELS_NAME",
    "COMPANY_IS_STAFFING",
    "IS_INTERNSHIP",
]

TOP_N = 50  # how many categories to show per column

for colname in categorical_cols:
    if colname not in src.columns:
        print(f"Skipping {colname} (column not found)")
        continue

    distinct_cnt = src.select(colname).distinct().count()
    print(f"\n--- {colname} (distinct={distinct_cnt}) ---")
    (
        src.groupBy(colname)
           .agg(count("*").alias("count"))
           .orderBy(desc("count"))
           .show(min(TOP_N, distinct_cnt), truncate=False)
    )

--- STATE_NAME (distinct=52) ---
+---------------------------------------+-----+
|STATE_NAME                             |count|
+---------------------------------------+-----+
|Texas                                  |8067 |
|California                             |7084 |
|Florida                                |3645 |
|Virginia                               |3636 |
|Illinois                               |3538 |
|New York                               |3341 |
|North Carolina                         |2747 |
|Georgia                                |2658 |
|Ohio                                   |2627 |
|New Jersey                             |2614 |
|Pennsylvania                           |2254 |
|Massachusetts                          |2057 |
|Michigan                               |1838 |
|Arizona                                |1638 |
|Washington                             |1624 |
|Minnesota                              |1476 |
|Colorado                               |1455 |
|Maryland                               |1360 |
|Tennessee                              |1274 |
|Missouri                               |1231 |
|Washington, D.C. (District of Columbia)|1224 |
|Oregon                                 |1090 |
|Wisconsin                              |1050 |
|Indiana                                |956  |
|Connecticut                            |863  |
|Kansas                                 |740  |
|Alabama                                |690  |
|South Carolina                         |645  |
|Utah                                   |639  |
|Kentucky                               |635  |
|Iowa                                   |625  |
|Oklahoma                               |606  |
|Nevada                                 |589  |
|Arkansas                               |584  |
|Nebraska                               |540  |
|Idaho                                  |478  |
|Mississippi                            |471  |
|Rhode Island                           |458  |
|Louisiana                              |456  |
|Delaware                               |438  |
|Maine                                  |345  |
|New Hampshire                          |295  |
|South Dakota                           |290  |
|Hawaii                                 |263  |
|New Mexico                             |255  |
|Alaska                                 |236  |
|Vermont                                |227  |
|Montana                                |186  |
|West Virginia                          |159  |
|North Dakota                           |149  |
+---------------------------------------+-----+
only showing top 50 rows

--- REMOTE_TYPE_NAME (distinct=5) ---
+----------------+-----+
|REMOTE_TYPE_NAME|count|
+----------------+-----+
|[None]          |56570|
|Remote          |12497|
|Hybrid Remote   |2260 |
|Not Remote      |1127 |
|NULL            |44   |
+----------------+-----+

--- EMPLOYMENT_TYPE_NAME (distinct=4) ---
+------------------------+-----+
|EMPLOYMENT_TYPE_NAME    |count|
+------------------------+-----+
|Full-time (> 32 hours)  |69176|
|Part-time (≤ 32 hours)  |2298 |
|Part-time / full-time   |980  |
|NULL                    |44   |
+------------------------+-----+

--- MIN_EDULEVELS_NAME (distinct=7) ---
+----------------------------+-----+
|MIN_EDULEVELS_NAME          |count|
+----------------------------+-----+
|Bachelor's degree           |41471|
|No Education Listed         |22110|
|High school or GED          |3748 |
|Associate degree            |3141 |
|Master's degree             |1874 |
|Ph.D. or professional degree|110  |
|NULL                        |44   |
+----------------------------+-----+

--- COMPANY_IS_STAFFING (distinct=3) ---
+-------------------+-----+
|COMPANY_IS_STAFFING|count|
+-------------------+-----+
|false              |62808|
|true               |9646 |
|NULL               |44   |
+-------------------+-----+

--- IS_INTERNSHIP (distinct=3) ---
+-------------+-----+
|IS_INTERNSHIP|count|
+-------------+-----+
|false        |70689|
|true         |1765 |
|NULL         |44   |
+-------------+-----+
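As a compact alternative to the per-column loop (a sketch, not required), all value counts can be gathered in one pass by stacking column/value pairs and grouping once:

# Sketch: one-pass value counts for every categorical column via explode
from pyspark.sql.functions import array, col, desc, explode, lit, struct
pairs = src.select(explode(array(*[
    struct(lit(c).alias("column"), col(c).cast("string").alias("value"))
    for c in categorical_cols if c in src.columns
])).alias("kv"))
pairs.groupBy("kv.column", "kv.value").count().orderBy("column", desc("count")).show(50, truncate=False)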
                                                                                

6 Remote type name fix

from pyspark.sql.functions import col, when, trim

# Start from df_eda if it exists; otherwise use df
base = df_eda if 'df_eda' in globals() else df

# Normalize REMOTE_TYPE_NAME values
df_eda = (
    base.withColumn(
        "REMOTE_TYPE_NAME",
        when(trim(col("REMOTE_TYPE_NAME")) == "Remote", "Remote")
        .when(trim(col("REMOTE_TYPE_NAME")) == "[None]", "Undefined")
        .when(trim(col("REMOTE_TYPE_NAME")) == "Not Remote", "On Premise")
        .when(trim(col("REMOTE_TYPE_NAME")) == "Hybrid Remote", "Hybrid")
        .when(col("REMOTE_TYPE_NAME").isNull(), "On Premise")
        .otherwise(col("REMOTE_TYPE_NAME"))
    )
)

# df_eda.createOrReplaceTempView("df_eda")  # ← uncomment if you want a SQL temp view

categorical_cols = ["REMOTE_TYPE_NAME"]

for colname in categorical_cols:
    if colname not in df_eda.columns:
        print(f"⚠️ Skipping {colname} (column not found)")
        continue
    print(f"\n--- {colname} ---")
    df_eda.select(colname).distinct().show(50, truncate=False)

--- REMOTE_TYPE_NAME ---
+----------------+
|REMOTE_TYPE_NAME|
+----------------+
|Remote          |
|On Premise      |
|Hybrid          |
|Undefined       |
+----------------+
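A frequency check after the recode (sketch; counts not re-run here) would confirm that the 44 NULL rows were folded into On Premise alongside the 1,127 Not Remote rows:

# Sketch: verify the recoded remote-type categories and their counts
from pyspark.sql.functions import desc
df_eda.groupBy("REMOTE_TYPE_NAME").count().orderBy(desc("count")).show(truncate=False)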
                                                                                

7 Employment type fix

# Normalize EMPLOYMENT_TYPE_NAME values and inspect them

from pyspark.sql.functions import col, when, trim, lower, regexp_replace, desc, count

# Start from df_eda if it exists; otherwise use df
base = df_eda if 'df_eda' in globals() else df

# A normalized helper (lowercased, trimmed, single-spaced) for matching
emp_norm = regexp_replace(lower(trim(col("EMPLOYMENT_TYPE_NAME"))), r"\s+", " ")

df_eda = (
    base.withColumn(
        "EMPLOYMENT_TYPE_NAME",
        when(col("EMPLOYMENT_TYPE_NAME").isNull(), "Fulltime")                                   # NULL → Fulltime
        .when(emp_norm.isin("part-time / full-time", "part time / full time", "part-time/full-time"),
              "Flexible")                                                                        # "Part-time / full-time" → Flexible
        .when(emp_norm.rlike(r"part[- ]?time\s*\((<=|)\s*32\s*hours\)"),
              "Parttime")                                                                        # "Part-time (<=/≤ 32 hours)" → Parttime
        .when(emp_norm.rlike(r"full[- ]?time\s*\(>\s*32\s*hours\)"),
              "Fulltime")                                                                        # "Full-time (> 32 hours)" → Fulltime
        .otherwise(col("EMPLOYMENT_TYPE_NAME"))                                                  # keep others as-is
    )
)

# --- Option A: distinct values only ---
print("\n--- EMPLOYMENT_TYPE_NAME (distinct) ---")
df_eda.select("EMPLOYMENT_TYPE_NAME").distinct().show(50, truncate=False)

# --- Option B: frequency table (more informative) ---
print("\n--- EMPLOYMENT_TYPE_NAME (counts) ---")
(df_eda.groupBy("EMPLOYMENT_TYPE_NAME")
       .agg(count("*").alias("count"))
       .orderBy(desc("count"))
       .show(truncate=False))

--- EMPLOYMENT_TYPE_NAME (distinct) ---
+--------------------+
|EMPLOYMENT_TYPE_NAME|
+--------------------+
|Flexible            |
|Parttime            |
|Fulltime            |
+--------------------+


--- EMPLOYMENT_TYPE_NAME (counts) ---
+--------------------+-----+
|EMPLOYMENT_TYPE_NAME|count|
+--------------------+-----+
|Fulltime            |69220|
|Parttime            |2298 |
|Flexible            |980  |
+--------------------+-----+
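Because the part-time label appears with both the ASCII "<=" and the Unicode "≤", the pattern is worth checking locally with Python's re before trusting it inside rlike (a small sketch on sample strings; the syntax used here behaves the same in Java and Python regex):

# Sketch: confirm the part-time pattern matches both ASCII and Unicode variants
import re
pat = re.compile(r"part[- ]?time\s*\((<=|≤)?\s*32\s*hours\)")
for s in ["part-time (<= 32 hours)", "part-time (≤ 32 hours)", "full-time (> 32 hours)"]:
    print(f"{s!r} -> {bool(pat.search(s))}")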